Mathematics and Statistics Assignment 15

In [1]:
import numpy as np
import plotly.graph_objects as go
from scipy.stats import kurtosis, skew
from scipy.stats import anderson
import random
import statistics
from prettytable import PrettyTable

Question 1

For a given sequence (File: E. coli K-12 MG 1655, GneBank id: U00096), find the frequency of occurrence of each of the nucleotides A, T, G, C’s, dinucleotides, and trinucleotides. For example for sequence ATGCCG dinucleotides would be AT, TG, GC etc and trinucleotides would be ATG, TGC, GCC etc.

In [2]:
# Reading Input

ecoli = []

with open("./Seq_Data/ecoli_11kb.txt", "r") as f:
    f.readline()
    ecoli = list(f.readline().strip().upper());
In [3]:
nucleotides = {'A':0, 'G':0, 'C':0, 'T':0}
dinucleotides = {}
trinucleotides = {}

for i in nucleotides:
    for j in nucleotides:
        dinucleotides[i+j] = 0
        for k in nucleotides:
            trinucleotides[i+j+k] = 0
In [4]:
# Computing frequency

for i in range(len(ecoli)):
    nucleotides[ecoli[i]] = nucleotides[ecoli[i]] + 1
    if i+1 < len(ecoli):
        dinucleotides[ecoli[i]+ecoli[i+1]] = dinucleotides[ecoli[i]+ecoli[i+1]] + 1
        if i+2 < len(ecoli):
            trinucleotides[ecoli[i]+ecoli[i+1]+ecoli[i+2]] = trinucleotides[ecoli[i]+ecoli[i+1]+ecoli[i+2]] + 1

print("Nucleotide Frequency: ")
print(nucleotides)
print("Dinucleotide Frequency: ")
print(dinucleotides)
print("Trinucleotide Frequency: ")
print(trinucleotides)
Nucleotide Frequency: 
{'A': 2731, 'G': 2986, 'C': 2746, 'T': 2537}
Dinucleotide Frequency: 
{'AA': 900, 'AG': 620, 'AC': 622, 'AT': 589, 'GA': 752, 'GG': 662, 'GC': 876, 'GT': 696, 'CA': 638, 'CG': 865, 'CC': 612, 'CT': 631, 'TA': 441, 'TG': 838, 'TC': 636, 'TT': 621}
Trinucleotide Frequency: 
{'AAA': 292, 'AAG': 262, 'AAC': 211, 'AAT': 135, 'AGA': 166, 'AGG': 128, 'AGC': 198, 'AGT': 128, 'ACA': 116, 'ACG': 175, 'ACC': 191, 'ACT': 140, 'ATA': 95, 'ATG': 159, 'ATC': 206, 'ATT': 129, 'GAA': 320, 'GAG': 92, 'GAC': 152, 'GAT': 188, 'GGA': 119, 'GGG': 91, 'GGC': 238, 'GGT': 214, 'GCA': 203, 'GCG': 261, 'GCC': 172, 'GCT': 240, 'GTA': 149, 'GTG': 188, 'GTC': 142, 'GTT': 217, 'CAA': 157, 'CAG': 208, 'CAC': 143, 'CAT': 130, 'CGA': 209, 'CGG': 208, 'CGC': 223, 'CGT': 225, 'CCA': 147, 'CCG': 255, 'CCC': 102, 'CCT': 108, 'CTA': 74, 'CTG': 319, 'CTC': 115, 'CTT': 123, 'TAA': 131, 'TAG': 58, 'TAC': 116, 'TAT': 136, 'TGA': 258, 'TGG': 235, 'TGC': 216, 'TGT': 129, 'TCA': 172, 'TCG': 174, 'TCC': 147, 'TCT': 143, 'TTA': 123, 'TTG': 172, 'TTC': 173, 'TTT': 152}

Question 2

Plot the density of nucleotides in a sequence (File: E. coli K-12 MG 1655). Graphically display n-mer (for n=2 and 3) in the given sequence. (You may use Excel sheet for obtaining the plots).

In [5]:
# Plotting Density functions

fig = go.Figure()
fig.add_trace(go.Bar(x=list(nucleotides.keys()),
                y=[x/np.sum(list(nucleotides.values())) for x in list(nucleotides.values())],
                marker_color='rgb(55, 83, 109)'
                ))
fig.update_layout(
    title='Density of Nucleotides',
    xaxis_title_text="Nucleotide",
    yaxis_title_text="Density"
)
fig.show()

fig2 = go.Figure()
fig2.add_trace(go.Bar(x=list(dinucleotides.keys()),
                y=[x/np.sum(list(dinucleotides.values())) for x in list(dinucleotides.values())],
                marker_color='rgb(55, 83, 109)'
                ))
fig2.update_layout(
    title='Density of Dinucleotides',
    xaxis_title_text="Dinucleotide",
    yaxis_title_text="Density"
)
fig2.show()

fig3 = go.Figure()
fig3.add_trace(go.Bar(x=list(trinucleotides.keys()),
                y=[x/np.sum(list(trinucleotides.values())) for x in list(trinucleotides.values())],
                marker_color='rgb(55, 83, 109)'
                ))
fig3.update_layout(
    title='Density of Trinucleotides',
    xaxis_title_text="Trinucleotide",
    yaxis_title_text="Density"
)
fig3.show()

Question 3

Consider the output generated by Program 1.
(a) What probability would you assign to an A being followed by a G? Under what assumptions does this seem appropriate?
(b) Given that there is an A at a given position along the E. coli genome, what probability would you assign to it being followed by a G, and why?
(c) Does the E. coli composition data suggest that the event we observe a G at one site is independent of the previous two bases? Explain fully with appropriate data.

Answers

(a) P(A being followed by G) = P(AG) = 0.056 (from the ba chart above). This is appropriate only if we assume the occurance of a dinucleotide is independent of occurances of other dinucleotide. Which technically isn't the case.

(b) P(G|A) = P(AG)/P(A) = 0.056/0.2483 = 0.2255

(c) To check if an event of occurance G is indepent of other events, it's enough to show that
P(G|E) = P(G) for any and every event.

P(G) = 0.2714 P(G|A) = 0.2255 P(G|C) = 0.31 which is a lot higher than P(G).
Therefore occurance of G is not independent of occurance or presence of of previous two bases.
Clearly the probability of occurance of G varies depending on if the preceeding base is an A or a C.

Question 4

For the given sequence (File: E. coli K-12 MG 1655), count number of purines (A,G) in each block (block size of 100,1000, and 10,000bp).
(a) Calculate the mean and standard deviation of the proportion of purines per block, and draw histograms of these numbers.
(b) Compare the results of (a) across the different block sizes and comment.
(c) For each block size calculate fraction of counts within 1, 2 and 3 standard deviations of the mean.

In [6]:
def isPurine(n):
    if ecoli[n] == 'A' or ecoli[n] == 'G':
        return True
    return False
In [7]:
count100 = np.zeros(len(ecoli)-100+1)
count1000 = np.zeros(len(ecoli)-1000+1)
count10000 = np.zeros(len(ecoli)-10000+1)

count = 0;
for i in range(100):
    if(isPurine(i)):
        count = count + 1
count100[0] = count
for i in range(100,len(ecoli)):
    if(isPurine(i-100)):
        count = count - 1
    if(isPurine(i)):
        count = count + 1
    count100[i+1-100] = count
print("count100: ",count100)

count = 0;
for i in range(1000):
    if(isPurine(i)):
        count = count + 1
count1000[0] = count
for i in range(1000,len(ecoli)):
    if(isPurine(i-1000)):
        count = count - 1
    if(isPurine(i)):
        count = count + 1
    count1000[i+1-1000] = count
print("count1000: ",count1000)

count = 0;
for i in range(10000):
    if(isPurine(i)):
        count = count + 1
count10000[0] = count
for i in range(10000,len(ecoli)):
    if(isPurine(i-10000)):
        count = count - 1
    if(isPurine(i)):
        count = count + 1
    count10000[i+1-10000] = count
print("count10000: ",count10000)
count100:  [49. 48. 49. ... 45. 45. 44.]
count1000:  [507. 507. 508. ... 518. 517. 517.]
count10000:  [5200. 5200. 5201. ... 5211. 5211. 5210.]
In [8]:
# Mean proportion of a block is nothing but the proportion of the block itself

proportion100 = count100/100
proportion1000 = count1000/1000
proportion10000 = count10000/10000

# Standard deviation of each sample is nothing but (pq/N)^0.5

stdev100 = [(proportion100[i]*(1-proportion100[i])/100)**0.5 for i in range(len(count100))]
stdev1000 = [(proportion1000[i]*(1-proportion1000[i])/100)**0.5 for i in range(len(count1000))]
stdev10000 = [(proportion10000[i]*(1-proportion10000[i])/100)**0.5 for i in range(len(count10000))]
In [9]:
# Plotting distribution of proportion means

fig = go.Figure()
fig.add_trace(go.Histogram(
    x=proportion100,
    histnorm='probability',
    marker_color='#091b52',
    opacity=0.75
))

fig.update_layout(
    title_text='Distribution of proportion means N = 100', # title of plot
    xaxis_title_text='proportion', # xaxis label
    yaxis_title_text='Probability', # yaxis label
)

fig.show()

fig1 = go.Figure()
fig1.add_trace(go.Histogram(
    x=proportion1000,
    histnorm='probability',
    marker_color='#091b52',
    opacity=0.75
))

fig1.update_layout(
    title_text='Distribution of proportion means N = 1000', # title of plot
    xaxis_title_text='proportion', # xaxis label
    yaxis_title_text='Probability', # yaxis label
)

fig1.show()

fig2 = go.Figure()
fig2.add_trace(go.Histogram(
    x=proportion10000,
    histnorm='probability',
    marker_color='#091b52',
    opacity=0.75
))

fig2.update_layout(
    title_text='Distribution of proportion means N = 10000', # title of plot
    xaxis_title_text='proportion', # xaxis label
    yaxis_title_text='Probability', # yaxis label
)

fig2.show()

Comments

When comparing the three graphs obtained, we can see that as the sample size increases the proportion means seem to converge to 0.5 i.e. half. Also we see that the range of proportion values decrease, in the last case it never happens that there exists a block with less than 0.5.

Theoretically this would mean that the probability of finding a piridine converges to 0.5.

In [10]:
# Plotting distribution of proportion means

fig = go.Figure()
fig.add_trace(go.Histogram(
    x=stdev100,
    histnorm='probability',
    marker_color='#091b52',
    opacity=0.75
))

fig.update_layout(
    title_text='Distribution of block standard deviations N = 100', # title of plot
    xaxis_title_text='standard deviation', # xaxis label
    yaxis_title_text='Probability', # yaxis label
)

fig.show()

fig1 = go.Figure()
fig1.add_trace(go.Histogram(
    x=stdev1000,
    histnorm='probability',
    marker_color='#091b52',
    opacity=0.75
))

fig1.update_layout(
    title_text='Distribution of block standard deviations N = 1000', # title of plot
    xaxis_title_text='standard deviation', # xaxis label
    yaxis_title_text='Probability', # yaxis label
)

fig1.show()

fig2 = go.Figure()
fig2.add_trace(go.Histogram(
    x=stdev10000,
    histnorm='probability',
    marker_color='#091b52',
    opacity=0.75
))

fig2.update_layout(
    title_text='Distribution of block standard deviations N = 10000', # title of plot
    xaxis_title_text='standard deviation', # xaxis label
    yaxis_title_text='Probability', # yaxis label
)

fig2.show()

Comments

From the standard deviation graphs, we can see that the value seems to be converging and the range of possible standard deviations are decreasing but not as rampant as that of the mean.

In [11]:
# Calculating sampling distibution means and standard deviations

mean_100 = np.mean(proportion100)
mean_1000 = np.mean(proportion1000)
mean_10000 = np.mean(proportion10000)

stdev_100 = statistics.stdev(proportion100)
stdev_1000 = statistics.stdev(proportion1000)
stdev_10000 = statistics.stdev(proportion10000)

count_100_1 = 0
count_100_2 = 0
count_100_3 = 0

count_1000_1 = 0
count_1000_2 = 0
count_1000_3 = 0

count_10000_1 = 0
count_10000_2 = 0
count_10000_3 = 0

for i in proportion100:
    if mean_100 - 3*stdev_100 < i < mean_100 + 3*stdev_100:
        count_100_3 = count_100_3 + 1
    if mean_100 - 2*stdev_100 < i < mean_100 + 2*stdev_100:
        count_100_2 = count_100_2 + 1
    if mean_100 - 1*stdev_100 < i < mean_100 + 1*stdev_100:
        count_100_1 = count_100_1 + 1
        
for i in proportion1000:
    if mean_1000 - 3*stdev_1000 < i < mean_1000 + 3*stdev_1000:
        count_1000_3 = count_1000_3 + 1
    if mean_1000 - 2*stdev_1000 < i < mean_1000 + 2*stdev_1000:
        count_1000_2 = count_1000_2 + 1
    if mean_1000 - 1*stdev_1000 < i < mean_1000 + 1*stdev_1000:
        count_1000_1 = count_1000_1 + 1

for i in proportion10000:
    if mean_10000 - 3*stdev_10000 < i < mean_10000 + 3*stdev_10000:
        count_10000_3 = count_10000_3 + 1
    if mean_10000 - 2*stdev_10000 < i < mean_10000 + 2*stdev_10000:
        count_10000_2 = count_10000_2 + 1
    if mean_10000 - 1*stdev_10000 < i < mean_10000 + 1*stdev_10000:
        count_10000_1 = count_10000_1 + 1

print("Block Size: 100 ")
print("Fraction within 1 Stdev: ", count_100_1/len(count100))
print("Fraction within 2 Stdev: ", count_100_2/len(count100))
print("Fraction within 3 Stdev: ", count_100_3/len(count100))
print("Block Size: 1000 ")
print("Fraction within 1 Stdev: ", count_1000_1/len(count1000))
print("Fraction within 2 Stdev: ", count_1000_2/len(count1000))
print("Fraction within 3 Stdev: ", count_1000_3/len(count1000))
print("Block Size: 10000 ")
print("Fraction within 1 Stdev: ", count_10000_1/len(count10000))
print("Fraction within 1 Stdev: ", count_10000_2/len(count10000))
print("Fraction within 1 Stdev: ", count_10000_3/len(count10000))
Block Size: 100 
Fraction within 1 Stdev:  0.6490230254105128
Fraction within 2 Stdev:  0.9732134666544354
Fraction within 3 Stdev:  0.9987157141546648
Block Size: 1000 
Fraction within 1 Stdev:  0.7122287771222878
Fraction within 2 Stdev:  0.9336066393360664
Fraction within 3 Stdev:  1.0
Block Size: 10000 
Fraction within 1 Stdev:  0.6133866133866134
Fraction within 1 Stdev:  0.983016983016983
Fraction within 1 Stdev:  1.0
In [62]:
 
Out[62]:
0.0005408299021106148
In [ ]: